library(Biostrings)
library(tidyverse)
library(ggplot2)
library("ggbeeswarm") # avoid overlapping points
# library(patchwork)
head( methods(class = "DNAStringSet") )
## [1] "!,List-method" "!=,ANY,Vector-method"
## [3] "!=,Vector,ANY-method" "!=,Vector,Vector-method"
## [5] "[,List-method" "[[,List-method"
methods(generic = "reverseComplement")
## [1] reverseComplement,DNAString-method
## [2] reverseComplement,DNAStringSet-method
## [3] reverseComplement,MaskedDNAString-method
## [4] reverseComplement,MaskedRNAString-method
## [5] reverseComplement,matrix-method
## [6] reverseComplement,QualityScaledDNAStringSet-method
## [7] reverseComplement,QualityScaledRNAStringSet-method
## [8] reverseComplement,RNAString-method
## [9] reverseComplement,RNAStringSet-method
## [10] reverseComplement,XStringViews-method
## see '?methods' for accessing help and source code
Examples: * Inference of gene locations * reads alignments to quantify gene expression * infer location of chipseq reads (location of the TF binding site to the closest gene)
## Loading required package: GenomeInfoDb
Grange: class. includes genomic locations & associations * seqnames: could be chromosomes, contigs, … * star (*) means unknown strand
# + = plus strand
exon = GRanges(c('chr1:20-30:+', 'chr1:40-50:+', "chr1:45-55"))
exon
## GRanges object with 3 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 20-30 +
## [2] chr1 40-50 +
## [3] chr1 45-55 *
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
exon = GRanges(c('chr1:20-30:+', 'chr1:40-50:+', "chr1:45-55:+"))
exon is a vector (like DNAstring)
length(exon)
## [1] 3
start coordinate
start(exon)
## [1] 20 40 45
end(exon)
## [1] 30 50 55
Implementation of bioconductor assumes that intervals are closed intervals (in the operations) * e.g. 40 to 50: 50 - 40 + 1 = 11
width(exon)
## [1] 11 11 11
Collapse all exons regions
reduce(exon)
## GRanges object with 2 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 20-30 +
## [2] chr1 40-55 +
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
Disjoint range
disjoin(exon)
## GRanges object with 4 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 20-30 +
## [2] chr1 40-44 +
## [3] chr1 45-50 +
## [4] chr1 51-55 +
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
promoters(exon, upstream=2000, downstream=3000)
## GRanges object with 3 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 -1980-3019 +
## [2] chr1 -1960-3039 +
## [3] chr1 -1955-3044 +
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
Project reads onto the genome * Interpretation: 19 zeros, 11 ones, 9 zeros, 5 ones, 6 twos, 5 ones * (see representation in next chunk for mvisual representation)
coverage(exon)
## RleList of length 1
## $chr1
## integer-Rle of length 55 with 6 runs
## Lengths: 19 11 9 5 6 5
## Values : 0 1 0 1 2 1
coverage(exon) %>% unlist() %>% as.integer()
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0
## [36] 0 0 0 0 1 1 1 1 1 2 2 2 2 2 2 1 1 1 1 1
Coerce it as granges
cvg <- coverage(exon)
cvg <- cvg %>% as('GRanges')
cvg
## GRanges object with 6 ranges and 1 metadata column:
## seqnames ranges strand | score
## <Rle> <IRanges> <Rle> | <integer>
## [1] chr1 1-19 * | 0
## [2] chr1 20-30 * | 1
## [3] chr1 31-39 * | 0
## [4] chr1 40-44 * | 1
## [5] chr1 45-50 * | 2
## [6] chr1 51-55 * | 1
## -------
## seqinfo: 1 sequence from an unspecified genome
granges(cvg)
## GRanges object with 6 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 1-19 *
## [2] chr1 20-30 *
## [3] chr1 31-39 *
## [4] chr1 40-44 *
## [5] chr1 45-50 *
## [6] chr1 51-55 *
## -------
## seqinfo: 1 sequence from an unspecified genome
You can add elements to this dataframe
cvg$score <- c(1,0,0,4,3,5)
cvg
## GRanges object with 6 ranges and 1 metadata column:
## seqnames ranges strand | score
## <Rle> <IRanges> <Rle> | <numeric>
## [1] chr1 1-19 * | 1
## [2] chr1 20-30 * | 0
## [3] chr1 31-39 * | 0
## [4] chr1 40-44 * | 4
## [5] chr1 45-50 * | 3
## [6] chr1 51-55 * | 5
## -------
## seqinfo: 1 sequence from an unspecified genome
mcols(cvg)
## DataFrame with 6 rows and 1 column
## score
## <numeric>
## 1 1
## 2 0
## 3 0
## 4 4
## 5 3
## 6 5
df <- DataFrame(
i = 1:3,
dna = DNAStringSet(c('AAA','CCC','GGG')),
gr = exon
)
df
## DataFrame with 3 rows and 3 columns
## i dna gr
## <integer> <DNAStringSet> <GRanges>
## 1 1 AAA chr1:20-30:+
## 2 2 CCC chr1:40-50:+
## 3 3 GGG chr1:45-55:+
snp <- GRanges(c('chr1:23435','chr2:54323'))
Shift SNP locations (e.g. if another reference was used):
shift(snp, 1)
## GRanges object with 2 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 23436 *
## [2] chr2 54324 *
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
Extend SNP ranges:
flank(snp, 10)
## GRanges object with 2 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 23425-23434 *
## [2] chr2 54313-54322 *
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
How many of the exons contain SNPs inside, what is the overlap?
snp <- GRanges(c('chr1:15:+','chr1:25:+','chr1:35:+'))
countOverlaps(snp, exon)
## [1] 0 1 0
Viceversa:
countOverlaps(exon, snp)
## [1] 1 0 0
findOverlaps(snp, exon)
## Hits object with 1 hit and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 2 1
## -------
## queryLength: 3 / subjectLength: 3
snp[snp %over% exon]
## GRanges object with 1 range and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 25 +
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] GenomicRanges_1.36.0 GenomeInfoDb_1.20.0 ggbeeswarm_0.6.0
## [4] forcats_0.4.0 stringr_1.4.0 dplyr_0.8.2
## [7] purrr_0.3.2 readr_1.3.1 tidyr_0.8.3
## [10] tibble_2.1.3 ggplot2_3.2.0 tidyverse_1.2.1
## [13] Biostrings_2.52.0 XVector_0.24.0 IRanges_2.18.1
## [16] S4Vectors_0.22.0 BiocGenerics_0.30.0
##
## loaded via a namespace (and not attached):
## [1] beeswarm_0.2.3 tidyselect_0.2.5 xfun_0.8
## [4] haven_2.1.0 lattice_0.20-38 colorspace_1.4-1
## [7] generics_0.0.2 htmltools_0.3.6 yaml_2.2.0
## [10] rlang_0.4.0 pillar_1.4.2 glue_1.3.1
## [13] withr_2.1.2 modelr_0.1.4 readxl_1.3.1
## [16] GenomeInfoDbData_1.2.1 zlibbioc_1.30.0 munsell_0.5.0
## [19] gtable_0.3.0 cellranger_1.1.0 rvest_0.3.4
## [22] evaluate_0.14 knitr_1.23 vipor_0.4.5
## [25] broom_0.5.2 Rcpp_1.0.1 scales_1.0.0
## [28] backports_1.1.4 jsonlite_1.6 hms_0.4.2
## [31] digest_0.6.19 stringi_1.4.3 grid_3.6.0
## [34] bitops_1.0-6 cli_1.1.0 tools_3.6.0
## [37] magrittr_1.5 RCurl_1.95-4.12 lazyeval_0.2.2
## [40] crayon_1.3.4 pkgconfig_2.0.2 xml2_1.2.0
## [43] lubridate_1.7.4 assertthat_0.2.1 rmarkdown_1.13
## [46] httr_1.4.0 rstudioapi_0.10 R6_2.4.0
## [49] nlme_3.1-140 compiler_3.6.0
head(DNase)
## Grouped Data: density ~ conc | Run
## Run conc density
## 1 1 0.04882812 0.017
## 2 1 0.04882812 0.018
## 3 1 0.19531250 0.121
## 4 1 0.19531250 0.124
## 5 1 0.39062500 0.206
## 6 1 0.39062500 0.215
{
plot(DNase$conc, DNase$density,
xlab="conc", ylab="density",
col=DNase$Run, cex=3)
abline(v=DNase$conc, lwd=0.05, lty="dotted")
}
hist(DNase$conc)
boxplot(density ~ Run, data=DNase)
ggplot(data=DNase, aes(x=Run, y=density, color=Run)) + geom_boxplot()
ggplot(data=DNase, aes(x=conc, y=density, color=Run)) + geom_point(shape=1, size=2) +
geom_smooth(method="loess")
load("hiiragi2013.rda")
library(Biobase)
hiiragi2013
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 45101 features, 101 samples
## element names: exprs
## protocolData
## sampleNames: 1 E3.25 2 E3.25 ... 101 E4.5 (FGF4-KO) (101 total)
## varLabels: ScanDate
## varMetadata: labelDescription
## phenoData
## sampleNames: 1 E3.25 2 E3.25 ... 101 E4.5 (FGF4-KO) (101 total)
## varLabels: File.name Embryonic.day ... sampleColour (8 total)
## varMetadata: labelDescription
## featureData
## featureNames: 1415670_at 1415671_at ... AFFX-TrpnX-M_at (45101
## total)
## fvarLabels: symbol genename ensembl
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: mouse4302
dim(hiiragi2013)
## Features Samples
## 45101 101
Extract quantitative values
head(exprs(hiiragi2013))
## 1 E3.25 2 E3.25 3 E3.25 4 E3.25 5 E3.25 6 E3.25
## 1415670_at 4.910459 7.526672 6.956328 6.424048 5.105808 5.856685
## 1415671_at 9.768979 9.144228 9.295010 11.059831 9.376749 9.681017
## 1415672_at 10.411893 10.918942 9.495738 10.317996 11.143684 10.234943
## 1415673_at 5.618108 6.439416 6.730465 4.914527 5.619778 7.188673
## 1415674_a_at 7.541891 8.380285 8.480580 7.977363 8.650312 8.639637
## 1415675_at 8.590070 7.661697 8.741957 9.147643 8.868919 8.595630
## 7 E3.25 8 E3.25 9 E3.25 10 E3.25 11 E3.25 12 E3.25
## 1415670_at 5.059961 4.574661 8.123073 5.464257 6.378490 5.980889
## 1415671_at 7.665886 9.325743 9.724729 9.389818 10.741852 10.076715
## 1415672_at 10.642970 9.304958 11.037632 9.754123 10.574534 9.557437
## 1415673_at 6.395441 6.405085 6.542729 6.842668 4.638282 7.233038
## 1415674_a_at 7.645431 5.265520 6.748849 6.951920 8.436673 8.326323
## 1415675_at 8.266214 8.359522 8.896249 9.503661 8.972576 7.525839
## 13 E3.25 14 E3.25 15 E3.25 16 E3.25 17 E3.25 18 E3.25
## 1415670_at 8.535549 8.084051 8.466555 7.721368 8.043886 7.936210
## 1415671_at 10.457479 9.152969 9.846789 9.662910 9.719014 11.228105
## 1415672_at 9.903827 9.857872 9.448139 10.304187 9.791350 11.376457
## 1415673_at 6.570877 7.860524 6.508946 5.871037 3.420043 6.258020
## 1415674_a_at 6.637048 6.486028 5.965517 7.148297 8.682546 9.539372
## 1415675_at 6.908303 8.377355 8.857803 8.232332 8.736221 8.259772
## 19 E3.25 20 E3.25 21 E3.25 22 E3.25 23 E3.25 24 E3.25
## 1415670_at 8.372314 8.790075 7.699547 9.160169 9.078835 7.026946
## 1415671_at 11.097919 11.161601 11.484050 11.868276 11.073027 11.077749
## 1415672_at 11.325410 11.202199 11.079766 10.111392 10.758893 9.982358
## 1415673_at 7.040959 7.061794 7.767596 7.186408 5.480284 3.787961
## 1415674_a_at 8.242590 9.988684 9.263087 10.173450 8.572024 9.577342
## 1415675_at 8.078979 9.553001 9.608263 8.958113 9.229941 8.644070
## 25 E3.25 26 E3.25 27 E3.25 28 E3.25 29 E3.25 30 E3.25
## 1415670_at 9.481374 5.042345 10.060482 8.720720 7.640830 8.198603
## 1415671_at 11.648165 10.192303 11.588828 10.410176 10.848364 11.786923
## 1415672_at 10.845297 11.528406 10.822370 11.443782 10.777431 10.848957
## 1415673_at 6.547257 8.512493 7.591152 4.543854 6.596599 8.371666
## 1415674_a_at 9.494986 8.829271 9.548763 8.830112 7.970468 9.456158
## 1415675_at 8.982421 8.796370 7.805410 8.821485 8.694920 10.280600
## 31 E3.25 32 E3.25 33 E3.25 34 E3.25 35 E3.25 36 E3.25
## 1415670_at 8.706866 9.447069 8.296739 6.834025 8.541503 6.077360
## 1415671_at 10.700026 10.540521 11.572670 10.797511 11.134510 11.248140
## 1415672_at 11.252663 10.440958 11.210719 11.234910 10.464442 10.749835
## 1415673_at 8.056224 7.939600 6.608207 4.958611 4.800360 7.310682
## 1415674_a_at 8.359363 9.329402 10.167002 8.767243 8.556250 7.314833
## 1415675_at 9.559761 9.280311 8.273592 8.834519 7.971188 8.666798
## 37 E3.5 (PE) 38 E3.5 (PE) 39 E3.5 (EPI) 40 E3.5 (EPI)
## 1415670_at 8.273882 7.620525 7.866760 8.415855
## 1415671_at 9.344318 11.214708 10.018825 10.772849
## 1415672_at 8.263001 10.262658 10.455752 10.575892
## 1415673_at 4.733935 7.198648 7.192898 5.688193
## 1415674_a_at 7.658033 5.559332 8.075276 8.645646
## 1415675_at 8.864403 8.463336 8.232021 8.945480
## 41 E3.5 (PE) 42 E3.5 (EPI) 43 E3.5 (PE) 44 E3.5 (EPI)
## 1415670_at 10.069573 5.186645 8.277925 8.907035
## 1415671_at 10.995412 10.281639 10.322795 11.227185
## 1415672_at 10.434905 11.055458 10.326748 10.353424
## 1415673_at 7.873462 7.056827 7.628060 7.880329
## 1415674_a_at 8.924068 5.786938 8.090168 7.259978
## 1415675_at 9.264361 9.268250 5.406516 8.734146
## 45 E3.5 (EPI) 46 E3.5 (EPI) 47 E3.5 (EPI) 48 E3.5 (EPI)
## 1415670_at 8.037981 9.564671 8.413460 6.067820
## 1415671_at 10.039085 9.807397 10.763823 9.366529
## 1415672_at 10.670212 8.679185 11.023366 8.100330
## 1415673_at 7.678721 7.319281 6.180487 5.994487
## 1415674_a_at 4.957499 9.630059 6.087135 8.319604
## 1415675_at 9.492942 7.148993 8.243656 8.465048
## 49 E3.5 (PE) 50 E3.5 (PE) 51 E3.5 (PE) 52 E3.5 (PE)
## 1415670_at 7.283398 8.162521 8.915069 8.003773
## 1415671_at 10.404443 10.821639 10.302784 9.375657
## 1415672_at 9.640297 10.852335 10.524796 9.453517
## 1415673_at 9.593375 8.803020 7.272420 6.506133
## 1415674_a_at 9.804526 8.337384 8.640091 9.231093
## 1415675_at 8.572719 9.777869 8.585933 8.941711
## 53 E3.5 (PE) 54 E3.5 (PE) 55 E3.5 (EPI) 56 E3.5 (EPI)
## 1415670_at 8.504471 9.230439 7.280949 7.546062
## 1415671_at 8.878585 9.419837 10.289884 8.132641
## 1415672_at 10.326346 9.670100 10.294861 9.830191
## 1415673_at 7.222724 7.341542 5.511479 5.819504
## 1415674_a_at 9.021208 8.154534 8.516675 7.097983
## 1415675_at 10.095101 9.219913 8.635134 9.563152
## 57 E3.5 (EPI) 58 E3.5 (PE) 59 E4.5 (PE) 60 E4.5 (EPI)
## 1415670_at 7.609851 7.600535 9.503289 8.657604
## 1415671_at 9.193082 10.539076 11.458339 8.824234
## 1415672_at 11.286201 10.426612 6.699087 9.496283
## 1415673_at 8.123130 6.496563 6.765473 8.776755
## 1415674_a_at 8.324338 6.429535 9.974538 9.098873
## 1415675_at 8.293887 9.680122 9.294144 9.221649
## 61 E4.5 (PE) 62 E4.5 (EPI) 63 E4.5 (EPI) 64 E4.5 (EPI)
## 1415670_at 9.373941 7.420272 8.363999 10.516034
## 1415671_at 11.336816 9.386655 8.589102 9.612615
## 1415672_at 10.131304 8.756391 9.728199 8.839825
## 1415673_at 9.114517 6.190662 5.844180 6.814642
## 1415674_a_at 8.894763 9.312049 8.043095 8.597153
## 1415675_at 9.983117 9.027379 7.846470 7.746031
## 65 E4.5 (PE) 66 E4.5 (PE) 67 E3.25 (FGF4-KO)
## 1415670_at 10.233514 9.086783 6.985313
## 1415671_at 10.518523 10.833673 10.615071
## 1415672_at 10.099166 9.295808 9.754342
## 1415673_at 9.513386 9.252811 5.287788
## 1415674_a_at 6.982438 8.848143 9.092702
## 1415675_at 9.453644 9.127367 8.901534
## 68 E3.25 (FGF4-KO) 69 E3.25 (FGF4-KO) 70 E3.25 (FGF4-KO)
## 1415670_at 5.814319 7.690732 8.075658
## 1415671_at 10.379045 10.567961 10.715209
## 1415672_at 7.039647 8.955027 9.063367
## 1415673_at 6.039571 5.684059 5.794135
## 1415674_a_at 7.039116 8.084398 7.926238
## 1415675_at 7.648410 7.952806 8.899669
## 71 E3.25 (FGF4-KO) 72 E3.25 (FGF4-KO) 73 E3.25 (FGF4-KO)
## 1415670_at 8.267528 5.180107 7.199486
## 1415671_at 10.131859 10.250684 10.576188
## 1415672_at 10.177928 9.551955 8.138070
## 1415673_at 7.735918 6.620887 6.392453
## 1415674_a_at 9.796577 9.031277 8.916845
## 1415675_at 8.878228 9.071759 7.943296
## 74 E3.25 (FGF4-KO) 75 E3.25 (FGF4-KO) 76 E3.25 (FGF4-KO)
## 1415670_at 6.384739 8.767213 5.770023
## 1415671_at 10.094252 11.502889 10.000846
## 1415672_at 7.661539 9.856569 3.375618
## 1415673_at 5.538817 6.186746 4.806714
## 1415674_a_at 8.861186 9.431775 9.454379
## 1415675_at 7.319638 9.263623 8.817041
## 77 E3.25 (FGF4-KO) 78 E3.25 (FGF4-KO) 79 E3.25 (FGF4-KO)
## 1415670_at 6.675188 9.397408 10.440421
## 1415671_at 11.100101 10.160398 11.297621
## 1415672_at 8.726354 10.674683 10.649906
## 1415673_at 4.490444 8.232427 6.235927
## 1415674_a_at 8.552819 7.264563 9.667478
## 1415675_at 8.675685 9.566866 8.457816
## 80 E3.25 (FGF4-KO) 81 E3.25 (FGF4-KO) 82 E3.25 (FGF4-KO)
## 1415670_at 7.506113 9.537808 9.626242
## 1415671_at 11.336896 11.714058 11.966809
## 1415672_at 10.283193 10.965307 6.882138
## 1415673_at 10.018832 9.438075 5.373948
## 1415674_a_at 9.453293 9.279124 8.096551
## 1415675_at 9.221731 8.961749 8.976936
## 83 E3.25 (FGF4-KO) 84 E3.5 (FGF4-KO) 85 E3.5 (FGF4-KO)
## 1415670_at 9.082400 8.410321 8.182409
## 1415671_at 11.750857 11.390676 11.009250
## 1415672_at 9.453529 11.097374 11.176322
## 1415673_at 8.095055 5.609975 6.732978
## 1415674_a_at 9.287606 10.106055 8.930789
## 1415675_at 8.318733 9.556829 8.767664
## 86 E3.5 (FGF4-KO) 87 E3.5 (FGF4-KO) 88 E3.5 (FGF4-KO)
## 1415670_at 8.084051 7.178949 8.394489
## 1415671_at 11.369503 11.531595 11.308553
## 1415672_at 10.426476 9.838740 11.079161
## 1415673_at 8.229005 7.149751 7.641474
## 1415674_a_at 7.324169 8.711869 9.361509
## 1415675_at 8.708365 8.887450 9.011559
## 89 E3.5 (FGF4-KO) 90 E3.5 (FGF4-KO) 91 E3.5 (FGF4-KO)
## 1415670_at 8.410588 9.306985 8.157402
## 1415671_at 11.267228 11.779960 11.623497
## 1415672_at 9.704876 10.726146 10.753831
## 1415673_at 6.607004 7.004673 8.000327
## 1415674_a_at 9.404744 7.730486 9.044348
## 1415675_at 8.845187 7.243588 9.564489
## 92 E4.5 (FGF4-KO) 93 E4.5 (FGF4-KO) 94 E4.5 (FGF4-KO)
## 1415670_at 8.359740 9.575871 6.179409
## 1415671_at 11.175453 9.591774 9.518947
## 1415672_at 10.885286 9.339838 9.370398
## 1415673_at 4.957310 5.652768 6.910332
## 1415674_a_at 8.592410 5.954694 8.159053
## 1415675_at 6.257513 8.496400 7.931365
## 95 E4.5 (FGF4-KO) 96 E4.5 (FGF4-KO) 97 E4.5 (FGF4-KO)
## 1415670_at 8.601108 8.051324 4.911292
## 1415671_at 10.526221 10.770072 6.375060
## 1415672_at 7.787682 10.084626 8.773603
## 1415673_at 6.694354 6.104253 5.313348
## 1415674_a_at 9.041147 8.667740 8.028946
## 1415675_at 6.235425 9.763872 9.293012
## 98 E4.5 (FGF4-KO) 99 E4.5 (FGF4-KO) 100 E4.5 (FGF4-KO)
## 1415670_at 3.801598 8.360182 7.999680
## 1415671_at 11.130234 8.052922 7.425286
## 1415672_at 10.768072 10.327700 7.729140
## 1415673_at 4.872900 8.932054 5.028091
## 1415674_a_at 5.790260 9.019782 5.045660
## 1415675_at 8.688155 7.368104 7.122485
## 101 E4.5 (FGF4-KO)
## 1415670_at 8.193070
## 1415671_at 9.148713
## 1415672_at 10.283193
## 1415673_at 4.505772
## 1415674_a_at 8.652528
## 1415675_at 8.672924
Extract sample annotations
# View(pData(hiiragi2013))
dftx = data.frame(t(exprs(hiiragi2013)), pData(hiiragi2013))
ggplot(dftx, aes(x=X1426642_at, y = X1418765_at)) +
geom_point(aes(colour=sampleColour)) +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
selectedProbes <- c(Fgf4 = "1420085_at", Gata4 = "1418863_at",
Gata6 = "1425463_at", Sox2 = "1416967_at")
library(dplyr)
library(tidyverse)
tmp <- data.frame(t(exprs(hiiragi2013[selectedProbes, ])))
names(tmp) <- names(selectedProbes)
tmp$sample <- rownames(tmp)
Wide to long format
genes <- gather(tmp, "gene","expression", -sample)
head(genes)
## sample gene expression
## 1 1 E3.25 Fgf4 3.027715
## 2 2 E3.25 Fgf4 9.293016
## 3 3 E3.25 Fgf4 2.940142
## 4 4 E3.25 Fgf4 9.715243
## 5 5 E3.25 Fgf4 8.924228
## 6 6 E3.25 Fgf4 11.325952
genes %>%
filter(gene == "Gata4") %>%
ggplot(aes(x = expression)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p <- ggplot(genes, aes(x = gene, y = expression, fill = gene))
bxplot <- p + geom_boxplot()
bxplot
p <- ggplot(genes, aes(x = gene, y = expression, fill = gene))
bxplot <- p + geom_violin()
bxplot
jtrplot <- p +
geom_jitter(aes(colour = gene)) +
theme(legend.position = "none")
jtrplot
dotplot <- p + geom_dotplot(binaxis = "y", binwidth = 1/6,
stackdir = "center", stackratio = 0.75,
aes(color = gene)) +
theme(legend.position = "none")
dotplot
Beeplot: to avoid overlapping points
beeplot <- p + geom_beeswarm(aes(color = gene)) +
theme(legend.position = "none")
beeplot
# jtrplot + dotplot + beeplot
dfx <- as.data.frame(Biobase::exprs(hiiragi2013))
scp <- ggplot(dfx, aes(x=`59 E4.5 (PE)`, y=`92 E4.5 (FGF4-KO)`)) + geom_point()
scp
scp <- ggplot(dfx, aes(x=`59 E4.5 (PE)`, y=`92 E4.5 (FGF4-KO)`)) + geom_point(alpha=0.1)
scp
scp <- ggplot(dfx, aes(x=`59 E4.5 (PE)`, y=`92 E4.5 (FGF4-KO)`)) + geom_density2d(h=0.5, bins=60)
scp
scp <- ggplot(dfx, aes(x=`59 E4.5 (PE)`, y=`92 E4.5 (FGF4-KO)`)) + geom_density2d()
scp
scp <- ggplot(dfx, aes(x=`59 E4.5 (PE)`, y=`92 E4.5 (FGF4-KO)`)) + geom_hex()
scp
Useful to check annotations of data points
library("plotly")
scp <- ggplot(dfx[1:100, ],
aes(x= `59 E4.5 (PE)`, y = `92 E4.5 (FGF4-KO)`))
scp2 <- scp + geom_point()
ggplotly(scp2)
library(SummarizedExperiment)
library(airway)
set.seed(123)
data(airway)
airway
## class: RangedSummarizedExperiment
## dim: 64102 8
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
Rows: Ensemble genes columns: samples
dim(airway)
## [1] 64102 8
dim(assay(airway))
## [1] 64102 8
nrow(colData(airway))
## [1] 8
dplyr::glimpse( dimnames(airway) )
## List of 2
## $ : chr [1:64102] "ENSG00000000003" "ENSG00000000005" "ENSG00000000419" "ENSG00000000457" ...
## $ : chr [1:8] "SRR1039508" "SRR1039509" "SRR1039512" "SRR1039513" ...
colData(airway)
## DataFrame with 8 rows and 9 columns
## SampleName cell dex albut Run avgLength
## <factor> <factor> <factor> <factor> <factor> <integer>
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101
## SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98
## Experiment Sample BioSample
## <factor> <factor> <factor>
## SRR1039508 SRX384345 SRS508568 SAMN02422669
## SRR1039509 SRX384346 SRS508567 SAMN02422675
## SRR1039512 SRX384349 SRS508571 SAMN02422678
## SRR1039513 SRX384350 SRS508572 SAMN02422670
## SRR1039516 SRX384353 SRS508575 SAMN02422682
## SRR1039517 SRX384354 SRS508576 SAMN02422673
## SRR1039520 SRX384357 SRS508579 SAMN02422683
## SRR1039521 SRX384358 SRS508580 SAMN02422677
No rowdata:
rowData(airway)
## DataFrame with 64102 rows and 0 columns
rowRanges: * 1 entry per row * contains info about the exons
rowRanges(airway)
## GRangesList object of length 64102:
## $ENSG00000000003
## GRanges object with 17 ranges and 2 metadata columns:
## seqnames ranges strand | exon_id exon_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] X 99883667-99884983 - | 667145 ENSE00001459322
## [2] X 99885756-99885863 - | 667146 ENSE00000868868
## [3] X 99887482-99887565 - | 667147 ENSE00000401072
## [4] X 99887538-99887565 - | 667148 ENSE00001849132
## [5] X 99888402-99888536 - | 667149 ENSE00003554016
## ... ... ... ... . ... ...
## [13] X 99890555-99890743 - | 667156 ENSE00003512331
## [14] X 99891188-99891686 - | 667158 ENSE00001886883
## [15] X 99891605-99891803 - | 667159 ENSE00001855382
## [16] X 99891790-99892101 - | 667160 ENSE00001863395
## [17] X 99894942-99894988 - | 667161 ENSE00001828996
##
## ...
## <64101 more elements>
## -------
## seqinfo: 722 sequences (1 circular) from an unspecified genome
Metadata function
metadata(airway)
## [[1]]
## Experiment data
## Experimenter name: Himes BE
## Laboratory: NA
## Contact information:
## Title: RNA-Seq transcriptome profiling identifies CRISPLD2 as a glucocorticoid responsive gene that modulates cytokine function in airway smooth muscle cells.
## URL: http://www.ncbi.nlm.nih.gov/pubmed/24926665
## PMIDs: 24926665
##
## Abstract: A 226 word abstract is available. Use 'abstract' method.
message(class(assay(airway)), dim(assay(airway)))
## matrix641028
head(assay(airway))
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003 679 448 873 408 1138
## ENSG00000000005 0 0 0 0 0
## ENSG00000000419 467 515 621 365 587
## ENSG00000000457 260 211 263 164 245
## ENSG00000000460 60 55 40 35 78
## ENSG00000000938 0 0 2 0 1
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003 1047 770 572
## ENSG00000000005 0 0 0
## ENSG00000000419 799 417 508
## ENSG00000000457 331 233 229
## ENSG00000000460 63 76 60
## ENSG00000000938 0 0 0
matrix: all elements must be of the same type
m <- matrix(
rnorm(12), nrow = 4, ncol = 3,
dimnames = list(letters[1:4], LETTERS[1:3])
)
m
## A B C
## a -0.56047565 0.1292877 -0.6868529
## b -0.23017749 1.7150650 -0.4456620
## c 1.55870831 0.4609162 1.2240818
## d 0.07050839 -1.2650612 0.3598138
dim(m)
## [1] 4 3
dimnames(m)
## [[1]]
## [1] "a" "b" "c" "d"
##
## [[2]]
## [1] "A" "B" "C"
m[,1,drop=FALSE]
## A
## a -0.56047565
## b -0.23017749
## c 1.55870831
## d 0.07050839
SummarizedExperiment has all the matrix-like interface functions incorporated
subset <- airway[1:5,1:4]
assay(subset)
## SRR1039508 SRR1039509 SRR1039512 SRR1039513
## ENSG00000000003 679 448 873 408
## ENSG00000000005 0 0 0 0
## ENSG00000000419 467 515 621 365
## ENSG00000000457 260 211 263 164
## ENSG00000000460 60 55 40 35
colData(subset)
## DataFrame with 4 rows and 9 columns
## SampleName cell dex albut Run avgLength
## <factor> <factor> <factor> <factor> <factor> <integer>
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87
## Experiment Sample BioSample
## <factor> <factor> <factor>
## SRR1039508 SRX384345 SRS508568 SAMN02422669
## SRR1039509 SRX384346 SRS508567 SAMN02422675
## SRR1039512 SRX384349 SRS508571 SAMN02422678
## SRR1039513 SRX384350 SRS508572 SAMN02422670
rowRanges(subset)
## GRangesList object of length 5:
## $ENSG00000000003
## GRanges object with 17 ranges and 2 metadata columns:
## seqnames ranges strand | exon_id exon_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] X 99883667-99884983 - | 667145 ENSE00001459322
## [2] X 99885756-99885863 - | 667146 ENSE00000868868
## [3] X 99887482-99887565 - | 667147 ENSE00000401072
## [4] X 99887538-99887565 - | 667148 ENSE00001849132
## [5] X 99888402-99888536 - | 667149 ENSE00003554016
## ... ... ... ... . ... ...
## [13] X 99890555-99890743 - | 667156 ENSE00003512331
## [14] X 99891188-99891686 - | 667158 ENSE00001886883
## [15] X 99891605-99891803 - | 667159 ENSE00001855382
## [16] X 99891790-99892101 - | 667160 ENSE00001863395
## [17] X 99894942-99894988 - | 667161 ENSE00001828996
##
## ...
## <4 more elements>
## -------
## seqinfo: 722 sequences (1 circular) from an unspecified genome
rowSums(m)
## a b c d
## -1.118041 1.039226 3.243706 -0.834739
colSums(m)
## A B C
## 0.8385636 1.0402077 0.4513808
Overall sample counts quantification * Representation of technical variation between samples
colSums(assay(airway))
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517
## 20637971 18809481 25348649 15163415 24448408 30818215
## SRR1039520 SRR1039521
## 19126151 21164133
genemeans <- rowMeans(log(assay(airway) + 1, base = exp(1)))
ggplot(data=data.frame(genemeans),
aes(x=genemeans)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
colData(airway)
## DataFrame with 8 rows and 9 columns
## SampleName cell dex albut Run avgLength
## <factor> <factor> <factor> <factor> <factor> <integer>
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101
## SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98
## Experiment Sample BioSample
## <factor> <factor> <factor>
## SRR1039508 SRX384345 SRS508568 SAMN02422669
## SRR1039509 SRX384346 SRS508567 SAMN02422675
## SRR1039512 SRX384349 SRS508571 SAMN02422678
## SRR1039513 SRX384350 SRS508572 SAMN02422670
## SRR1039516 SRX384353 SRS508575 SAMN02422682
## SRR1039517 SRX384354 SRS508576 SAMN02422673
## SRR1039520 SRX384357 SRS508579 SAMN02422683
## SRR1039521 SRX384358 SRS508580 SAMN02422677
airway$dex
## [1] untrt trt untrt trt untrt trt untrt trt
## Levels: trt untrt
Exercise:
colData(airway[,airway$dex == 'untrt'])
## DataFrame with 4 rows and 9 columns
## SampleName cell dex albut Run avgLength
## <factor> <factor> <factor> <factor> <factor> <integer>
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101
## Experiment Sample BioSample
## <factor> <factor> <factor>
## SRR1039508 SRX384345 SRS508568 SAMN02422669
## SRR1039512 SRX384349 SRS508571 SAMN02422678
## SRR1039516 SRX384353 SRS508575 SAMN02422682
## SRR1039520 SRX384357 SRS508579 SAMN02422683
nullgenes <- rowSums(assay(airway)) == 0
airwaynonull <- airway[!nullgenes,,drop=FALSE]
table(nullgenes)
## nullgenes
## FALSE TRUE
## 33469 30633
genemeans <- rowMeans(log(assay(airwaynonull) + 1, base = exp(1)))
ggplot(data=data.frame(genemeans),
aes(x=genemeans)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Endomorphisms of list: subsetting them or applying functions will yield another list as output. Except for double squared brackets
a <- list(v=seq(1,5), names = c('manola','carmona'))
lengths(a)
## v names
## 5 2
seqinfo contains info of genome annotation from the genomic data * circular, sequence lengths, genome
r <- rowRanges(airway)
seqinfo(r)
## Seqinfo object with 722 sequences (1 circular) from an unspecified genome:
## seqnames seqlengths isCircular genome
## 1 249250621 FALSE <NA>
## 2 243199373 FALSE <NA>
## 3 198022430 FALSE <NA>
## 4 191154276 FALSE <NA>
## 5 180915260 FALSE <NA>
## ... ... ... ...
## LRG_94 12428 FALSE <NA>
## LRG_96 93210 FALSE <NA>
## LRG_97 25996 FALSE <NA>
## LRG_98 18750 FALSE <NA>
## LRG_99 13294 FALSE <NA>
Annotation data can sometimes be accessed from browseVignetteS:
browseVignettes("airway")
## starting httpd help server ... done
library(dplyr)
seqinfo(r) %>% as('GRanges') %>% subset(seqnames == 14) -> chr14
chr14
## GRanges object with 1 range and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## 14 14 1-107349540 *
## -------
## seqinfo: 722 sequences (1 circular) from an unspecified genome
Over = overlaps
rowRanges(airway)[rowRanges(airway) %over% chr14]
## GRangesList object of length 2244:
## $ENSG00000006432
## GRanges object with 26 ranges and 2 metadata columns:
## seqnames ranges strand | exon_id exon_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] 14 71189243-71197581 - | 453195 ENSE00002597852
## [2] 14 71196368-71197581 - | 453196 ENSE00002485643
## [3] 14 71196383-71197581 - | 453197 ENSE00002510454
## [4] 14 71196653-71197581 - | 453198 ENSE00002249042
## [5] 14 71196852-71197581 - | 453199 ENSE00001518020
## ... ... ... ... . ... ...
## [22] 14 71249940-71250162 - | 453216 ENSE00002524190
## [23] 14 71267384-71267797 - | 453217 ENSE00001137145
## [24] 14 71275483-71275888 - | 453218 ENSE00001518024
## [25] 14 71275483-71276223 - | 453219 ENSE00002468641
## [26] 14 71275483-71276251 - | 453220 ENSE00002511719
##
## ...
## <2243 more elements>
## -------
## seqinfo: 722 sequences (1 circular) from an unspecified genome
ridx <- rowRanges(airway) %over% chr14
airway[ridx,]
## class: RangedSummarizedExperiment
## dim: 2244 8
## metadata(1): ''
## assays(1): counts
## rownames(2244): ENSG00000006432 ENSG00000009830 ...
## ENSG00000273259 ENSG00000273307
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
#TODO
Look for packages with org in their name
BiocManager::available("^org\\.")
## [1] "org.Ag.eg.db" "org.At.tair.db" "org.Bt.eg.db"
## [4] "org.Ce.eg.db" "org.Cf.eg.db" "org.Dm.eg.db"
## [7] "org.Dr.eg.db" "org.EcK12.eg.db" "org.EcSakai.eg.db"
## [10] "org.Gg.eg.db" "org.Hs.eg.db" "org.Mm.eg.db"
## [13] "org.Mmu.eg.db" "org.Pf.plasmo.db" "org.Pt.eg.db"
## [16] "org.Rn.eg.db" "org.Sc.sgd.db" "org.Ss.eg.db"
## [19] "org.Xl.eg.db"
# BiocManager::install("org.Hs.eg.db")
library(org.Hs.eg.db)
library(AnnotationHub)
org <- org.Hs.eg.db
keytypes availabel for org?
keytypes(org)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT"
## [5] "ENSEMBLTRANS" "ENTREZID" "ENZYME" "EVIDENCE"
## [9] "EVIDENCEALL" "GENENAME" "GO" "GOALL"
## [13] "IPI" "MAP" "OMIM" "ONTOLOGY"
## [17] "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [25] "UNIGENE" "UNIPROT"
Things that we can map to
columns(org)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT"
## [5] "ENSEMBLTRANS" "ENTREZID" "ENZYME" "EVIDENCE"
## [9] "EVIDENCEALL" "GENENAME" "GO" "GOALL"
## [13] "IPI" "MAP" "OMIM" "ONTOLOGY"
## [17] "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [25] "UNIGENE" "UNIPROT"
Mappings can be done via: * 1 to 1 mapping * 1 to many
ensid <- sample(rownames(airwaynonull), 10)
mapIdsmapIds(org, ensid, "SYMBOL", "ENSEMBL") %>% tibble::enframe("Ensemble","Symbol")
## 'select()' returned 1:1 mapping between keys and columns
## # A tibble: 10 x 2
## Ensemble Symbol
## <chr> <chr>
## 1 ENSG00000267086 <NA>
## 2 ENSG00000184260 HIST2H2AC
## 3 ENSG00000189280 GJB5
## 4 ENSG00000253121 <NA>
## 5 ENSG00000237857 <NA>
## 6 ENSG00000154451 GBP5
## 7 ENSG00000264201 MIR4701
## 8 ENSG00000259098 <NA>
## 9 ENSG00000144369 FAM171B
## 10 ENSG00000180658 OR2A4
mapIds(org, ensid, "GENENAME", "ENSEMBL") %>% tibble::enframe("GeneName","ENSEMBL")
## 'select()' returned 1:1 mapping between keys and columns
## # A tibble: 10 x 2
## GeneName ENSEMBL
## <chr> <chr>
## 1 ENSG00000267086 <NA>
## 2 ENSG00000184260 histone cluster 2 H2A family member c
## 3 ENSG00000189280 gap junction protein beta 5
## 4 ENSG00000253121 <NA>
## 5 ENSG00000237857 <NA>
## 6 ENSG00000154451 guanylate binding protein 5
## 7 ENSG00000264201 microRNA 4701
## 8 ENSG00000259098 <NA>
## 9 ENSG00000144369 family with sequence similarity 171 member B
## 10 ENSG00000180658 olfactory receptor family 2 subfamily A member 4
Multiple mappings:
AnnotationDbi::select(org, ensid, c("ENTREZID", "SYMBOL"), "ENSEMBL")
## 'select()' returned 1:1 mapping between keys and columns
## ENSEMBL ENTREZID SYMBOL
## 1 ENSG00000267086 <NA> <NA>
## 2 ENSG00000184260 8338 HIST2H2AC
## 3 ENSG00000189280 2709 GJB5
## 4 ENSG00000253121 <NA> <NA>
## 5 ENSG00000237857 <NA> <NA>
## 6 ENSG00000154451 115362 GBP5
## 7 ENSG00000264201 100616262 MIR4701
## 8 ENSG00000259098 <NA> <NA>
## 9 ENSG00000144369 165215 FAM171B
## 10 ENSG00000180658 79541 OR2A4
AnnotationDbi::select(org, ensid, c("GO"), "ENSEMBL")
## 'select()' returned 1:many mapping between keys and columns
## ENSEMBL GO EVIDENCE ONTOLOGY
## 1 ENSG00000267086 <NA> <NA> <NA>
## 2 ENSG00000184260 GO:0000786 IEA CC
## 3 ENSG00000184260 GO:0003674 ND MF
## 4 ENSG00000184260 GO:0005634 HDA CC
## 5 ENSG00000184260 GO:0005634 IDA CC
## 6 ENSG00000184260 GO:0008150 ND BP
## 7 ENSG00000184260 GO:0046982 IEA MF
## 8 ENSG00000184260 GO:0070062 HDA CC
## 9 ENSG00000189280 GO:0005922 IEA CC
## 10 ENSG00000189280 GO:0007154 IEA BP
## 11 ENSG00000189280 GO:0008544 TAS BP
## 12 ENSG00000189280 GO:0016021 IEA CC
## 13 ENSG00000189280 GO:0060707 IEA BP
## 14 ENSG00000189280 GO:0060708 IEA BP
## 15 ENSG00000189280 GO:0060713 IEA BP
## 16 ENSG00000189280 GO:1905867 IEA BP
## 17 ENSG00000253121 <NA> <NA> <NA>
## 18 ENSG00000237857 <NA> <NA> <NA>
## 19 ENSG00000154451 GO:0000139 IEA CC
## 20 ENSG00000154451 GO:0003924 IEA MF
## 21 ENSG00000154451 GO:0005515 IPI MF
## 22 ENSG00000154451 GO:0005525 IEA MF
## 23 ENSG00000154451 GO:0005737 IDA CC
## 24 ENSG00000154451 GO:0005794 IDA CC
## 25 ENSG00000154451 GO:0006954 IEA BP
## 26 ENSG00000154451 GO:0009617 IEA BP
## 27 ENSG00000154451 GO:0016020 HDA CC
## 28 ENSG00000154451 GO:0031410 IEA CC
## 29 ENSG00000154451 GO:0034067 IDA BP
## 30 ENSG00000154451 GO:0042802 IPI MF
## 31 ENSG00000154451 GO:0042803 IDA MF
## 32 ENSG00000154451 GO:0045089 ISS BP
## 33 ENSG00000154451 GO:0048471 IDA CC
## 34 ENSG00000154451 GO:0050702 IMP BP
## 35 ENSG00000154451 GO:0051289 IDA BP
## 36 ENSG00000154451 GO:0071346 IEP BP
## 37 ENSG00000154451 GO:0072616 ISS BP
## 38 ENSG00000154451 GO:1900017 ISS BP
## 39 ENSG00000154451 GO:1900227 IDA BP
## 40 ENSG00000264201 <NA> <NA> <NA>
## 41 ENSG00000259098 <NA> <NA> <NA>
## 42 ENSG00000144369 GO:0016021 IEA CC
## 43 ENSG00000180658 GO:0003674 ND MF
## 44 ENSG00000180658 GO:0004930 IEA MF
## 45 ENSG00000180658 GO:0004984 IEA MF
## 46 ENSG00000180658 GO:0005886 TAS CC
## 47 ENSG00000180658 GO:0016021 IEA CC
## 48 ENSG00000180658 GO:0032154 IDA CC
## 49 ENSG00000180658 GO:0032467 IMP BP
## 50 ENSG00000180658 GO:0032956 IMP BP
## 51 ENSG00000180658 GO:0050911 IEA BP
## 52 ENSG00000180658 GO:0055037 IDA CC
## 53 ENSG00000180658 GO:0090543 IDA CC
## 54 ENSG00000180658 GO:0097431 IDA CC
## 55 ENSG00000180658 GO:1990023 IDA CC
goid <- select(org, ensid, "GO", "ENSEMBL") %>% as_tibble()
## 'select()' returned 1:many mapping between keys and columns
head(goid)
## # A tibble: 6 x 4
## ENSEMBL GO EVIDENCE ONTOLOGY
## <chr> <chr> <chr> <chr>
## 1 ENSG00000267086 <NA> <NA> <NA>
## 2 ENSG00000184260 GO:0000786 IEA CC
## 3 ENSG00000184260 GO:0003674 ND MF
## 4 ENSG00000184260 GO:0005634 HDA CC
## 5 ENSG00000184260 GO:0005634 IDA CC
## 6 ENSG00000184260 GO:0008150 ND BP
library(GO.db)
##
columns(GO.db)
## [1] "DEFINITION" "GOID" "ONTOLOGY" "TERM"
keytypes(GO.db)
## [1] "DEFINITION" "GOID" "ONTOLOGY" "TERM"
gid <- unique(head(goid$GO))
gid
## [1] NA "GO:0000786" "GO:0003674" "GO:0005634" "GO:0008150"
AnnotationDbi::select(GO.db, gid, "TERM", "GOID")
## 'select()' returned 1:1 mapping between keys and columns
## GOID TERM
## 1 <NA> <NA>
## 2 GO:0000786 nucleosome
## 3 GO:0003674 molecular_function
## 4 GO:0005634 nucleus
## 5 GO:0008150 biological_process
Good alternative: annotation hub (web service) –> database
library(AnnotationHub)
beware if there are multiple versions of the same thing (check metadata columns)
hub <- AnnotationHub()
## snapshotDate(): 2019-05-02
query(hub, "^org\\.")
## AnnotationHub with 1710 records
## # snapshotDate(): 2019-05-02
## # $dataprovider: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/
## # $species: Escherichia coli, 'Chlorella vulgaris'_C-169, 'Klebsiella a...
## # $rdataclass: OrgDb
## # additional mcols(): taxonomyid, genome, description,
## # coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## # tags, rdatapath, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH70563"]]'
##
## title
## AH70563 | org.Ag.eg.db.sqlite
## AH70564 | org.At.tair.db.sqlite
## AH70565 | org.Bt.eg.db.sqlite
## AH70566 | org.Cf.eg.db.sqlite
## AH70567 | org.Gg.eg.db.sqlite
## ... ...
## AH73812 | org.Plasmodium_vivax.eg.sqlite
## AH73813 | org.Burkholderia_mallei_ATCC_23344.eg.sqlite
## AH73814 | org.Bacillus_cereus_(strain_ATCC_14579_|_DSM_31).eg.sqlite
## AH73815 | org.Bacillus_cereus_ATCC_14579.eg.sqlite
## AH73816 | org.Schizosaccharomyces_cryophilus_OY26.eg.sqlite
hub["AH70564"]
## AnnotationHub with 1 record
## # snapshotDate(): 2019-05-02
## # names(): AH70564
## # $dataprovider: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/
## # $species: Arabidopsis thaliana
## # $rdataclass: OrgDb
## # $rdatadateadded: 2019-04-29
## # $title: org.At.tair.db.sqlite
## # $description: NCBI gene ID based annotations about Arabidopsis thaliana
## # $taxonomyid: 3702
## # $genome: NCBI genomes
## # $sourcetype: NCBI/ensembl
## # $sourceurl: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/, ftp://ftp.ensembl....
## # $sourcesize: NA
## # $tags: c("NCBI", "Gene", "Annotation")
## # retrieve record with 'object[["AH70564"]]'
atorg <- hub[["AH70564"]]
## downloading 0 resources
## loading from cache
## 'AH70564 : 77310'
zoo <- hub["AH70564"]
Look for available packages
BiocManager::available("^TxDb\\.Hsap") %>% tibble::enframe(name=NULL)
## # A tibble: 5 x 1
## value
## <chr>
## 1 TxDb.Hsapiens.BioMart.igis
## 2 TxDb.Hsapiens.UCSC.hg18.knownGene
## 3 TxDb.Hsapiens.UCSC.hg19.knownGene
## 4 TxDb.Hsapiens.UCSC.hg19.lincRNAsTranscripts
## 5 TxDb.Hsapiens.UCSC.hg38.knownGene
query(hub, "^TxDb\\.Hsap")
## AnnotationHub with 6 records
## # snapshotDate(): 2019-05-02
## # $dataprovider: UCSC
## # $species: Homo sapiens
## # $rdataclass: TxDb
## # additional mcols(): taxonomyid, genome, description,
## # coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## # tags, rdatapath, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH52256"]]'
##
## title
## AH52256 | TxDb.Hsapiens.BioMart.igis.sqlite
## AH52257 | TxDb.Hsapiens.UCSC.hg18.knownGene.sqlite
## AH52258 | TxDb.Hsapiens.UCSC.hg19.knownGene.sqlite
## AH52259 | TxDb.Hsapiens.UCSC.hg19.lincRNAsTranscripts.sqlite
## AH52260 | TxDb.Hsapiens.UCSC.hg38.knownGene.sqlite
## AH70591 | TxDb.Hsapiens.UCSC.hg38.knownGene.sqlite
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
## Loading required package: GenomicFeatures
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
# Look for exons
ex <- exons(txdb)
ex
## GRanges object with 647025 ranges and 1 metadata column:
## seqnames ranges strand | exon_id
## <Rle> <IRanges> <Rle> | <integer>
## [1] chr1 11869-12227 + | 1
## [2] chr1 12010-12057 + | 2
## [3] chr1 12179-12227 + | 3
## [4] chr1 12613-12697 + | 4
## [5] chr1 12613-12721 + | 5
## ... ... ... ... . ...
## [647021] chrUn_GL000220v1 155997-156149 + | 647021
## [647022] chrUn_KI270442v1 380608-380726 + | 647022
## [647023] chrUn_KI270442v1 217250-217401 - | 647023
## [647024] chrUn_KI270744v1 51009-51114 - | 647024
## [647025] chrUn_KI270750v1 148668-148843 + | 647025
## -------
## seqinfo: 595 sequences (1 circular) from hg38 genome
transcripts(txdb)
## GRanges object with 226811 ranges and 2 metadata columns:
## seqnames ranges strand | tx_id
## <Rle> <IRanges> <Rle> | <integer>
## [1] chr1 11869-14409 + | 1
## [2] chr1 12010-13670 + | 2
## [3] chr1 29554-31097 + | 3
## [4] chr1 30267-31109 + | 4
## [5] chr1 30366-30503 + | 5
## ... ... ... ... . ...
## [226807] chrUn_GL000220v1 155997-156149 + | 226807
## [226808] chrUn_KI270442v1 380608-380726 + | 226808
## [226809] chrUn_KI270442v1 217250-217401 - | 226809
## [226810] chrUn_KI270744v1 51009-51114 - | 226810
## [226811] chrUn_KI270750v1 148668-148843 + | 226811
## tx_name
## <character>
## [1] ENST00000456328.2
## [2] ENST00000450305.2
## [3] ENST00000473358.1
## [4] ENST00000469289.1
## [5] ENST00000607096.1
## ... ...
## [226807] ENST00000619779.1
## [226808] ENST00000620265.1
## [226809] ENST00000611690.1
## [226810] ENST00000616830.1
## [226811] ENST00000612925.1
## -------
## seqinfo: 595 sequences (1 circular) from hg38 genome
exByTx = exonsBy(txdb, "tx")
exByTx
## GRangesList object of length 226811:
## $1
## GRanges object with 3 ranges and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## <Rle> <IRanges> <Rle> | <integer> <character> <integer>
## [1] chr1 11869-12227 + | 1 <NA> 1
## [2] chr1 12613-12721 + | 5 <NA> 2
## [3] chr1 13221-14409 + | 8 <NA> 3
##
## $2
## GRanges object with 6 ranges and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## [1] chr1 12010-12057 + | 2 <NA> 1
## [2] chr1 12179-12227 + | 3 <NA> 2
## [3] chr1 12613-12697 + | 4 <NA> 3
## [4] chr1 12975-13052 + | 6 <NA> 4
## [5] chr1 13221-13374 + | 7 <NA> 5
## [6] chr1 13453-13670 + | 9 <NA> 6
##
## $3
## GRanges object with 3 ranges and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## [1] chr1 29554-30039 + | 10 <NA> 1
## [2] chr1 30564-30667 + | 13 <NA> 2
## [3] chr1 30976-31097 + | 14 <NA> 3
##
## ...
## <226808 more elements>
## -------
## seqinfo: 595 sequences (1 circular) from hg38 genome
which.max(lengths(exByTx))
## 31375
## 31375
Exons from the gene:
exByTx[31375]
## GRangesList object of length 1:
## $31375
## GRanges object with 363 ranges and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] chr2 178807212-178807423 - | 92853 <NA>
## [2] chr2 178804552-178804655 - | 92850 <NA>
## [3] chr2 178802138-178802341 - | 92847 <NA>
## [4] chr2 178800395-178800682 - | 92846 <NA>
## [5] chr2 178799825-178799910 - | 92845 <NA>
## ... ... ... ... . ... ...
## [359] chr2 178529960-178530116 - | 92476 <NA>
## [360] chr2 178528528-178529219 - | 92475 <NA>
## [361] chr2 178528274-178528427 - | 92474 <NA>
## [362] chr2 178527446-178527748 - | 92473 <NA>
## [363] chr2 178525989-178527307 - | 92471 <NA>
## exon_rank
## <integer>
## [1] 1
## [2] 2
## [3] 3
## [4] 4
## [5] 5
## ... ...
## [359] 359
## [360] 360
## [361] 361
## [362] 362
## [363] 363
##
## -------
## seqinfo: 595 sequences (1 circular) from hg38 genome
BiocManager::available("curate")
## [1] "curatedAdipoChIP" "curatedAdipoRNA"
## [3] "curatedBladderData" "curatedBreastData"
## [5] "curatedCRCData" "curatedMetagenomicData"
## [7] "curatedOvarianData" "curatedTCGAData"
Download data from TCGA
library(curatedTCGAData)
## Loading required package: MultiAssayExperiment
curatedTCGAData()
## Please see the list below for available cohorts and assays
## Available Cancer codes:
## ACC BLCA BRCA CESC CHOL COAD DLBC ESCA GBM HNSC KICH
## KIRC KIRP LAML LGG LIHC LUAD LUSC MESO OV PAAD PCPG
## PRAD READ SARC SKCM STAD TGCT THCA THYM UCEC UCS UVM
## Available Data Types:
## CNACGH CNACGH_CGH_hg_244a
## CNACGH_CGH_hg_415k_g4124a CNASeq CNASNP
## CNVSNP GISTIC_AllByGene GISTIC_Peaks
## GISTIC_ThresholdedByGene Methylation
## Methylation_methyl27 Methylation_methyl450
## miRNAArray miRNASeqGene mRNAArray
## mRNAArray_huex mRNAArray_TX_g4502a
## mRNAArray_TX_g4502a_1
## mRNAArray_TX_ht_hg_u133a Mutation
## RNASeq2GeneNorm RNASeqGene RPPAArray
curatedTCGAData("BRCA")
## Title DispatchClass
## 31 BRCA_CNASeq-20160128 Rda
## 32 BRCA_CNASNP-20160128 Rda
## 33 BRCA_CNVSNP-20160128 Rda
## 35 BRCA_GISTIC_AllByGene-20160128 Rda
## 36 BRCA_GISTIC_Peaks-20160128 Rda
## 37 BRCA_GISTIC_ThresholdedByGene-20160128 Rda
## 39 BRCA_Methylation_methyl27-20160128_assays H5File
## 40 BRCA_Methylation_methyl27-20160128_se Rds
## 41 BRCA_Methylation_methyl450-20160128_assays H5File
## 42 BRCA_Methylation_methyl450-20160128_se Rds
## 43 BRCA_miRNASeqGene-20160128 Rda
## 44 BRCA_mRNAArray-20160128 Rda
## 45 BRCA_Mutation-20160128 Rda
## 46 BRCA_RNASeq2GeneNorm-20160128 Rda
## 47 BRCA_RNASeqGene-20160128 Rda
## 48 BRCA_RPPAArray-20160128 Rda
mae <- curatedTCGAData("BRCA", c("GISTIC_AllByGene", "RNASeq2GeneNorm"), dry.run=FALSE)
## snapshotDate(): 2019-04-29
## see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
## downloading 0 resources
## loading from cache
## 'EH588 : 588'
## see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
## downloading 0 resources
## loading from cache
## 'EH596 : 596'
## see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
## downloading 0 resources
## loading from cache
## 'EH587 : 587'
## see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
## downloading 0 resources
## loading from cache
## 'EH590 : 590'
## see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
## downloading 0 resources
## loading from cache
## 'EH599 : 599'
## harmonizing input:
## removing 12081 sampleMap rows not in names(experiments)
mae
## A MultiAssayExperiment object of 2 listed
## experiments with user-defined names and respective classes.
## Containing an ExperimentList class object of length 2:
## [1] BRCA_GISTIC_AllByGene-20160128: SummarizedExperiment with 24776 rows and 1080 columns
## [2] BRCA_RNASeq2GeneNorm-20160128: SummarizedExperiment with 20501 rows and 1212 columns
## Features:
## experiments() - obtain the ExperimentList instance
## colData() - the primary/phenotype DataFrame
## sampleMap() - the sample availability DataFrame
## `$`, `[`, `[[` - extract colData columns, subset, or experiment
## *Format() - convert into a long or wide DataFrame
## assays() - convert ExperimentList to a SimpleList of matrices
names(assays(mae))
## [1] "BRCA_GISTIC_AllByGene-20160128" "BRCA_RNASeq2GeneNorm-20160128"
mae[["BRCA_RNASeq2GeneNorm-20160128"]]
## class: SummarizedExperiment
## dim: 20501 1212
## metadata(0):
## assays(1): ''
## rownames(20501): A1BG A1CF ... psiTPTE22 tAKR
## rowData names(0):
## colnames(1212): TCGA-3C-AAAU-01A-11R-A41B-07
## TCGA-3C-AALI-01A-11R-A41B-07 ... TCGA-Z7-A8R5-01A-42R-A41B-07
## TCGA-Z7-A8R6-01A-11R-A41B-07
## colData names(0):
example: calculate distribution of library sizes
mae[["BRCA_RNASeq2GeneNorm-20160128"]] %>%
assay () %>%
colSums() %>%
hist()
biomart, etc
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] curatedTCGAData_1.6.0
## [2] MultiAssayExperiment_1.10.4
## [3] TxDb.Hsapiens.UCSC.hg38.knownGene_3.4.6
## [4] GenomicFeatures_1.36.3
## [5] GO.db_3.8.2
## [6] AnnotationHub_2.16.0
## [7] BiocFileCache_1.8.0
## [8] dbplyr_1.4.2
## [9] org.Hs.eg.db_3.8.2
## [10] AnnotationDbi_1.46.0
## [11] airway_1.4.0
## [12] SummarizedExperiment_1.14.0
## [13] DelayedArray_0.10.0
## [14] BiocParallel_1.18.0
## [15] matrixStats_0.54.0
## [16] plotly_4.9.0
## [17] Biobase_2.44.0
## [18] GenomicRanges_1.36.0
## [19] GenomeInfoDb_1.20.0
## [20] ggbeeswarm_0.6.0
## [21] forcats_0.4.0
## [22] stringr_1.4.0
## [23] dplyr_0.8.2
## [24] purrr_0.3.2
## [25] readr_1.3.1
## [26] tidyr_0.8.3
## [27] tibble_2.1.3
## [28] ggplot2_3.2.0
## [29] tidyverse_1.2.1
## [30] Biostrings_2.52.0
## [31] XVector_0.24.0
## [32] IRanges_2.18.1
## [33] S4Vectors_0.22.0
## [34] BiocGenerics_0.30.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 rstudioapi_0.10
## [3] hexbin_1.27.3 bit64_0.9-7
## [5] interactiveDisplayBase_1.22.0 fansi_0.4.0
## [7] lubridate_1.7.4 xml2_1.2.0
## [9] knitr_1.23 zeallot_0.1.0
## [11] jsonlite_1.6 Rsamtools_2.0.0
## [13] broom_0.5.2 shiny_1.3.2
## [15] BiocManager_1.30.4 compiler_3.6.0
## [17] httr_1.4.0 backports_1.1.4
## [19] assertthat_0.2.1 Matrix_1.2-17
## [21] lazyeval_0.2.2 cli_1.1.0
## [23] later_0.8.0 htmltools_0.3.6
## [25] prettyunits_1.0.2 tools_3.6.0
## [27] gtable_0.3.0 glue_1.3.1
## [29] GenomeInfoDbData_1.2.1 rappdirs_0.3.1
## [31] Rcpp_1.0.1 cellranger_1.1.0
## [33] vctrs_0.1.0 nlme_3.1-140
## [35] ExperimentHub_1.10.0 rtracklayer_1.44.0
## [37] crosstalk_1.0.0 xfun_0.8
## [39] rvest_0.3.4 mime_0.7
## [41] XML_3.98-1.20 zlibbioc_1.30.0
## [43] MASS_7.3-51.4 scales_1.0.0
## [45] hms_0.4.2 promises_1.0.1
## [47] yaml_2.2.0 curl_3.3
## [49] memoise_1.1.0 biomaRt_2.40.1
## [51] stringi_1.4.3 RSQLite_2.1.1
## [53] rlang_0.4.0 pkgconfig_2.0.2
## [55] bitops_1.0-6 evaluate_0.14
## [57] lattice_0.20-38 GenomicAlignments_1.20.1
## [59] htmlwidgets_1.3 labeling_0.3
## [61] bit_1.1-14 tidyselect_0.2.5
## [63] magrittr_1.5 R6_2.4.0
## [65] generics_0.0.2 DBI_1.0.0
## [67] pillar_1.4.2 haven_2.1.0
## [69] withr_2.1.2 RCurl_1.95-4.12
## [71] modelr_0.1.4 crayon_1.3.4
## [73] utf8_1.1.4 rmarkdown_1.13
## [75] progress_1.2.2 grid_3.6.0
## [77] readxl_1.3.1 data.table_1.12.2
## [79] blob_1.1.1 digest_0.6.19
## [81] xtable_1.8-4 httpuv_1.5.1
## [83] munsell_0.5.0 beeswarm_0.2.3
## [85] viridisLite_0.3.0 vipor_0.4.5